{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "from datetime import timedelta, datetime"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据路径和存储路径"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input CSVs: data1 is read from the train file, data2 from the test file.\n",
    "data1_path = \"data/train.csv\"\n",
    "data2_path = \"data/test.csv\"\n",
    "# Output CSVs written by the final save cell.\n",
    "data1_to_csv_path = \"data/train_clean.csv\"\n",
    "data2_to_csv_path = \"data/test_clean.csv\"\n",
    "# Output file for the JSON dump of json_dict.\n",
    "json_path = \"data/index_json.json\"\n",
    "\n",
    "# Filled by apptype/carrier/lan/embedding with the value lists they index against.\n",
    "json_dict = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the train (data1) and test (data2) tables with pandas' default parsing.\n",
    "data1 = pd.read_csv(data1_path)\n",
    "data2 = pd.read_csv(data2_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 删除部分特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def del_features(data):\n",
    "    \"\"\"Drop the columns this notebook does not use, in place.\n",
    "\n",
    "    Columns that are already absent are skipped, so calling this again on\n",
    "    an already-trimmed frame does nothing instead of raising KeyError.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame to trim; modified in place.\n",
    "    \"\"\"\n",
    "    del_list = [\"sid\", \"ver\", \"province\", \"idfamd5\", \"make\", \"os\", \"reqrealip\"]\n",
    "    present = [column for column in del_list if column in data.columns]\n",
    "    data.drop(columns=present, inplace=True)\n",
    "\n",
    "    print(\"del ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trim both the train and the test frame.\n",
    "del_features(data1)\n",
    "del_features(data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 连续值特征处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def nginxtime(data):\n",
    "    \"\"\"Add a minute-resolution ``time`` column derived from ``nginxtime``.\n",
    "\n",
    "    ``nginxtime`` is read as epoch milliseconds and shifted by +8 hours.\n",
    "    ``time`` is the number of whole minutes between midnight of the\n",
    "    earliest (shifted) calendar day in ``data`` and each row's timestamp.\n",
    "\n",
    "    The day offset is computed from calendar dates, not day-of-month\n",
    "    numbers, so a frame that spans a month boundary still gets increasing\n",
    "    values. The offset is relative to the frame passed in, so two frames\n",
    "    processed separately each start at their own earliest day.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame with an ``nginxtime`` column; modified in place.\n",
    "    \"\"\"\n",
    "    shifted = pd.to_datetime(data[\"nginxtime\"], unit=\"ms\") + timedelta(hours=8)\n",
    "    midnight = shifted.dt.normalize()\n",
    "    day = (midnight - midnight.min()).dt.days\n",
    "    data[\"time\"] = day * 24 * 60 + shifted.dt.hour * 60 + shifted.dt.minute\n",
    "\n",
    "    print(\"nginxtime ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive the time column for both frames.\n",
    "nginxtime(data1)\n",
    "nginxtime(data2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def h_w_ppi(data):\n",
    "    \"\"\"Collapse ``h``, ``w`` and ``ppi`` into one ``resolution_ratio`` column.\n",
    "\n",
    "    Each of the three columns is offset by 0.1 and the three results are\n",
    "    multiplied together. The source columns are then removed and the\n",
    "    product is stored as ``resolution_ratio``.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame with ``h``, ``w`` and ``ppi`` columns; modified in place.\n",
    "    \"\"\"\n",
    "    offset = 0.1\n",
    "    product = (data[\"h\"] + offset) * (data[\"w\"] + offset) * (data[\"ppi\"] + offset)\n",
    "\n",
    "    data.drop(columns=[\"h\", \"w\", \"ppi\"], inplace=True)\n",
    "    data[\"resolution_ratio\"] = product\n",
    "    print(\"h_w_ppi ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build resolution_ratio for both frames.\n",
    "h_w_ppi(data1)\n",
    "h_w_ppi(data2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ip(data, ip_index_path=\"data/ip_index.json\"):\n",
    "    \"\"\"Replace ``ip`` with a 0/1 flag saying whether it disagrees with ``city``.\n",
    "\n",
    "    The JSON file maps an IP string to a city name. A row gets 0 when the\n",
    "    city name looked up for its IP is a substring of the row's ``city``\n",
    "    value, and 1 otherwise (IP not in the file, non-string city such as a\n",
    "    missing value, or no substring match).\n",
    "\n",
    "    Rows are paired by position, so the result does not depend on the\n",
    "    frame's index labels.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame with ``ip`` and ``city`` columns; modified in place.\n",
    "        ip_index_path: Path of the IP-to-city JSON file.\n",
    "    \"\"\"\n",
    "    # IP -> city mapping scraped from the web.\n",
    "    with open(ip_index_path, \"r\", encoding=\"utf-8\") as f:\n",
    "        ip_index = json.load(f)\n",
    "\n",
    "    flags = []\n",
    "    for address, city in zip(data[\"ip\"], data[\"city\"]):\n",
    "        expected_city = ip_index.get(address)\n",
    "        matched = (\n",
    "            isinstance(expected_city, str)\n",
    "            and isinstance(city, str)\n",
    "            and expected_city in city\n",
    "        )\n",
    "        flags.append(0 if matched else 1)\n",
    "\n",
    "    data[\"ip\"] = flags\n",
    "    print(\"ip ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the ip column into the 0/1 flag for both frames.\n",
    "ip(data1)\n",
    "ip(data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# one-hot特征处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dvctype(data):\n",
    "    \"\"\"Rewrite the value 3 as 1 in the ``dvctype`` column.\n",
    "\n",
    "    Every other value is kept as it is.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame with a ``dvctype`` column; modified in place.\n",
    "    \"\"\"\n",
    "    data[\"dvctype\"] = [1 if code == 3 else code for code in data[\"dvctype\"]]\n",
    "    print(\"dvctype ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recode dvctype in both frames.\n",
    "dvctype(data1)\n",
    "dvctype(data2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def orientation(data):\n",
    "    \"\"\"Rewrite the value 90 as 2 in the ``orientation`` column.\n",
    "\n",
    "    Every other value is kept as it is.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame with an ``orientation`` column; modified in place.\n",
    "    \"\"\"\n",
    "    data[\"orientation\"] = [2 if angle == 90 else angle for angle in data[\"orientation\"]]\n",
    "    print(\"orientation ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recode orientation in both frames.\n",
    "orientation(data1)\n",
    "orientation(data2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def apptype(data1, data2):\n",
    "    \"\"\"Index-encode ``apptype`` in both frames with a shared vocabulary.\n",
    "\n",
    "    The vocabulary is the list of values present in both frames. Each\n",
    "    value is replaced by its position in that list; values outside it all\n",
    "    map to ``len(vocabulary)``. The vocabulary is stored in the\n",
    "    module-level ``json_dict`` under ``\"apptype\"``.\n",
    "\n",
    "    Positions come from a dict built once, so each row costs one hash\n",
    "    lookup rather than a linear scan of the vocabulary list.\n",
    "\n",
    "    Args:\n",
    "        data1: First DataFrame with an ``apptype`` column; modified in place.\n",
    "        data2: Second DataFrame with an ``apptype`` column; modified in place.\n",
    "    \"\"\"\n",
    "    global json_dict\n",
    "\n",
    "    apptype_set = list(set(data1[\"apptype\"]) & set(data2[\"apptype\"]))\n",
    "    json_dict[\"apptype\"] = apptype_set\n",
    "\n",
    "    position = {value: idx for idx, value in enumerate(apptype_set)}\n",
    "    unseen = len(apptype_set)\n",
    "    for frame in (data1, data2):\n",
    "        frame[\"apptype\"] = [position.get(value, unseen) for value in frame[\"apptype\"]]\n",
    "    print(\"apptype ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both frames go in together: the vocabulary is built from values present in both.\n",
    "apptype(data1, data2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def carrier(data1, data2):\n",
    "    \"\"\"Index-encode ``carrier`` in both frames with a shared vocabulary.\n",
    "\n",
    "    The vocabulary is the list of values present in both frames. Each\n",
    "    value is replaced by its position in that list. A value found in only\n",
    "    one frame maps to ``len(vocabulary)`` (the convention ``apptype`` and\n",
    "    ``lan`` use) instead of raising ValueError. The vocabulary is stored\n",
    "    in the module-level ``json_dict`` under ``\"carrier\"``.\n",
    "\n",
    "    Args:\n",
    "        data1: First DataFrame with a ``carrier`` column; modified in place.\n",
    "        data2: Second DataFrame with a ``carrier`` column; modified in place.\n",
    "    \"\"\"\n",
    "    global json_dict\n",
    "\n",
    "    carrier_set = list(set(data1[\"carrier\"]) & set(data2[\"carrier\"]))\n",
    "    json_dict[\"carrier\"] = carrier_set\n",
    "\n",
    "    position = {value: idx for idx, value in enumerate(carrier_set)}\n",
    "    unseen = len(carrier_set)\n",
    "    for frame in (data1, data2):\n",
    "        frame[\"carrier\"] = [position.get(value, unseen) for value in frame[\"carrier\"]]\n",
    "    print(\"carrier ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both frames go in together: the vocabulary is built from values present in both.\n",
    "carrier(data1, data2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lan(data1, data2):\n",
    "    \"\"\"Normalise and index-encode the ``lan`` column of both frames.\n",
    "\n",
    "    String values have every ``_`` and ``-`` removed and are lower-cased;\n",
    "    non-string values (e.g. missing entries) are left as they are. The\n",
    "    vocabulary is the list of normalised values present in both frames.\n",
    "    Each value is replaced by its position in that list; values outside\n",
    "    it all map to ``len(vocabulary)``. The vocabulary is stored in the\n",
    "    module-level ``json_dict`` under ``\"lan\"``.\n",
    "\n",
    "    Args:\n",
    "        data1: First DataFrame with a ``lan`` column; modified in place.\n",
    "        data2: Second DataFrame with a ``lan`` column; modified in place.\n",
    "    \"\"\"\n",
    "    global json_dict\n",
    "\n",
    "    def normalise(values):\n",
    "        # Strip separators and case from strings; pass anything else through.\n",
    "        cleaned = []\n",
    "        for value in values:\n",
    "            if isinstance(value, str):\n",
    "                value = value.replace(\"_\", \"\").replace(\"-\", \"\").lower()\n",
    "            cleaned.append(value)\n",
    "        return cleaned\n",
    "\n",
    "    new_lan_1 = normalise(data1[\"lan\"])\n",
    "    new_lan_2 = normalise(data2[\"lan\"])\n",
    "\n",
    "    lan_set = list(set(new_lan_1) & set(new_lan_2))\n",
    "    json_dict[\"lan\"] = lan_set\n",
    "\n",
    "    # One dict lookup per row instead of a list.index scan.\n",
    "    position = {value: idx for idx, value in enumerate(lan_set)}\n",
    "    unseen = len(lan_set)\n",
    "    data1[\"lan\"] = [position.get(value, unseen) for value in new_lan_1]\n",
    "    data2[\"lan\"] = [position.get(value, unseen) for value in new_lan_2]\n",
    "    print(\"lan ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both frames go in together: the vocabulary is built from values present in both.\n",
    "lan(data1, data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# embedding特征数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model(data):\n",
    "    \"\"\"Canonicalise handset names in the ``model`` column, in place.\n",
    "\n",
    "    String values are processed in this order:\n",
    "      1. values equal to a key of ``special_type`` (customised handset\n",
    "         codes) are swapped for the general name;\n",
    "      2. the characters `` _,+/-%().`` are removed;\n",
    "      3. the result is lower-cased;\n",
    "      4. ``huaweihuawei`` / ``xiaomixiaomi`` are collapsed to a single\n",
    "         brand name.\n",
    "    Non-string values (e.g. missing entries) are left as they are.\n",
    "\n",
    "    The code swap is done per value and the whole column is assigned back\n",
    "    once, rather than calling ``replace(..., inplace=True)`` on\n",
    "    ``data['model']``, which is not guaranteed to write through to the\n",
    "    frame.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame with a ``model`` column; modified in place.\n",
    "    \"\"\"\n",
    "    special_type = {\n",
    "        \"PACM00\": \"OPPO R15\", \"PBAM00\": \"OPPO A5\",\n",
    "        \"PBEM00\": \"OPPO R17\", \"PADM00\": \"OPPO A3\",\n",
    "        \"PBBM00\": \"OPPO A7\", \"PAAM00\": \"OPPO R15_1\",\n",
    "        \"PACT00\": \"OPPO R15_2\", \"PABT00\": \"OPPO A5_1\",\n",
    "        \"PBCM10\": \"OPPO R15x\",\n",
    "    }\n",
    "    strip_table = str.maketrans(\"\", \"\", \" _,+/-%().\")\n",
    "\n",
    "    new_model = []\n",
    "    for name in data[\"model\"]:\n",
    "        if isinstance(name, str):\n",
    "            name = special_type.get(name, name)\n",
    "            name = name.translate(strip_table).lower()\n",
    "            name = name.replace(\"huaweihuawei\", \"huawei\")\n",
    "            name = name.replace(\"xiaomixiaomi\", \"xiaomi\")\n",
    "        new_model.append(name)\n",
    "\n",
    "    data[\"model\"] = new_model\n",
    "    print(\"model ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean the model strings in both frames.\n",
    "model(data1)\n",
    "model(data2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def osv(data):\n",
    "    \"\"\"Normalise OS-version strings in the ``osv`` column, in place.\n",
    "\n",
    "    For string values: commas become dots, the literal text\n",
    "    `` 十核2.0G_HD`` is removed, and trailing ``0`` components are\n",
    "    stripped (8.0.0 -> 8, 9,1,0 -> 9.1). At least one component is always\n",
    "    kept, so an all-zero version such as 0.0 becomes 0.\n",
    "    Non-string values (e.g. missing entries) are left as they are.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame with an ``osv`` column; modified in place.\n",
    "    \"\"\"\n",
    "    new_osv = []\n",
    "    for version in data[\"osv\"]:\n",
    "        if isinstance(version, str):\n",
    "            version = version.replace(\",\", \".\").replace(\" 十核2.0G_HD\", \"\")\n",
    "            parts = version.split(\".\")\n",
    "            # Stop at one component so an all-zero version cannot empty the list.\n",
    "            while len(parts) > 1 and parts[-1] == \"0\":\n",
    "                parts.pop()\n",
    "            version = \".\".join(parts)\n",
    "        new_osv.append(version)\n",
    "\n",
    "    data[\"osv\"] = new_osv\n",
    "    print(\"osv ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean the osv strings in both frames.\n",
    "osv(data1)\n",
    "osv(data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# embedding特征处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def embedding(data1, data2):\n",
    "    \"\"\"Index-encode the embedding columns of both frames with one vocabulary.\n",
    "\n",
    "    For every column in ``embedding_index`` the values present in both\n",
    "    frames are collected; the union over all columns is the vocabulary,\n",
    "    stored in the module-level ``json_dict`` under ``\"embedding\"``. Each\n",
    "    value is replaced by its position in the vocabulary. A value that is\n",
    "    not in the vocabulary maps to ``len(vocabulary) + n``, where ``n`` is\n",
    "    the position of its column in ``embedding_index``, so every column has\n",
    "    its own out-of-vocabulary index.\n",
    "\n",
    "    Positions come from a dict built once, so each row costs one hash\n",
    "    lookup rather than a linear scan of the vocabulary list.\n",
    "\n",
    "    Args:\n",
    "        data1: First DataFrame holding the embedding columns; modified in place.\n",
    "        data2: Second DataFrame holding the embedding columns; modified in place.\n",
    "    \"\"\"\n",
    "    global json_dict\n",
    "\n",
    "    embedding_index = [\"pkgname\", \"adunitshowid\", \"mediashowid\", \"city\",\n",
    "                       \"adidmd5\", \"imeimd5\", \"openudidmd5\", \"macmd5\",\n",
    "                       \"model\", \"osv\"]\n",
    "\n",
    "    shared_values = set()\n",
    "    for feature in embedding_index:\n",
    "        shared_values |= set(data1[feature]) & set(data2[feature])\n",
    "\n",
    "    embedding_set = list(shared_values)\n",
    "    json_dict[\"embedding\"] = embedding_set\n",
    "\n",
    "    position = {value: idx for idx, value in enumerate(embedding_set)}\n",
    "    for n, feature in enumerate(embedding_index):\n",
    "        unseen = len(embedding_set) + n\n",
    "        for frame in (data1, data2):\n",
    "            frame[feature] = [position.get(value, unseen) for value in frame[feature]]\n",
    "        print(\"{} is ok\".format(feature))\n",
    "\n",
    "    print(\"embedding ok\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both frames go in together: the vocabulary is built from values present in both.\n",
    "embedding(data1, data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 交换列顺序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_col_index(data):\n",
    "    \"\"\"Return ``data`` restricted to the feature columns, in a fixed order.\n",
    "\n",
    "    ``label`` comes first when the frame has that column and is left out\n",
    "    when it does not. The decision is made from the frame's own columns,\n",
    "    not from the identity of any global variable, so copies and\n",
    "    reassigned frames are handled the same way.\n",
    "\n",
    "    Args:\n",
    "        data: DataFrame holding the columns listed in ``col_index``.\n",
    "\n",
    "    Returns:\n",
    "        A new DataFrame with the selected columns; ``data`` is not modified.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If any column other than ``label`` is missing.\n",
    "    \"\"\"\n",
    "    col_index = [\"label\", \"time\", \"ip\", \"resolution_ratio\", \"apptype\",\n",
    "                 \"dvctype\", \"ntt\", \"carrier\", \"orientation\", \"lan\", \"pkgname\",\n",
    "                 \"adunitshowid\", \"mediashowid\", \"city\", \"adidmd5\",\n",
    "                 \"imeimd5\", \"openudidmd5\", \"macmd5\", \"model\", \"osv\"]\n",
    "    if \"label\" not in data.columns:\n",
    "        col_index.remove(\"label\")\n",
    "\n",
    "    return data[col_index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# change_col_index returns a new frame and leaves its argument untouched,\n",
    "# so the result has to be assigned back for the saved CSVs to pick it up.\n",
    "data1 = change_col_index(data1)\n",
    "data2 = change_col_index(data2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write both frames without the pandas index column.\n",
    "data1.to_csv(data1_to_csv_path, index=False)\n",
    "data2.to_csv(data2_to_csv_path, index=False)\n",
    "# Persist the value lists collected in json_dict.\n",
    "with open(json_path,\"w\") as f:\n",
    "    json.dump(json_dict, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
